{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d0f27778",
   "metadata": {},
   "outputs": [],
   "source": [
    "%run -i \"../util/util_simple_classifier.ipynb\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "08ec3028",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langdetect import detect\n",
    "from nltk import word_tokenize\n",
    "from nltk.probability import FreqDist\n",
    "from nltk.corpus import stopwords\n",
    "from string import punctuation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "772a62c7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                   text  label\n",
      "0     the rock is destined to be the 21st century's ...      1\n",
      "1     the gorgeously elaborate continuation of \" the...      1\n",
      "2                        effective but too-tepid biopic      1\n",
      "3     if you sometimes like to go to the movies to h...      1\n",
      "4     emerges as something rare , an issue movie tha...      1\n",
      "...                                                 ...    ...\n",
      "8525  any enjoyment will be hinge from a personal th...      0\n",
      "8526  if legendary shlockmeister ed wood had ever ma...      0\n",
      "8527  hardly a nuanced portrait of a young woman's b...      0\n",
      "8528    interminably bleak , to say nothing of boring .      0\n",
      "8529  things really get weird , though not particula...      0\n",
      "\n",
      "[8530 rows x 2 columns]\n",
      "                                                   text  label\n",
      "0     lovingly photographed in the manner of a golde...      1\n",
      "1                 consistently clever and suspenseful .      1\n",
      "2     it's like a \" big chill \" reunion of the baade...      1\n",
      "3     the story gives ample opportunity for large-sc...      1\n",
      "4                     red dragon \" never cuts corners .      1\n",
      "...                                                 ...    ...\n",
      "1061  a terrible movie that some people will neverth...      0\n",
      "1062  there are many definitions of 'time waster' bu...      0\n",
      "1063  as it stands , crocodile hunter has the hurrie...      0\n",
      "1064  the thing looks like a made-for-home-video qui...      0\n",
      "1065  enigma is well-made , but it's just too dry an...      0\n",
      "\n",
      "[1066 rows x 2 columns]\n"
     ]
    }
   ],
   "source": [
    "(train_df, test_df) = load_train_test_dataset_pd(\"train\", \"test\")\n",
    "print(train_df)\n",
    "print(test_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "947c7320",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                   text  label lang\n",
      "0     the rock is destined to be the 21st century's ...      1   en\n",
      "1     the gorgeously elaborate continuation of \" the...      1   en\n",
      "2                        effective but too-tepid biopic      1   en\n",
      "3     if you sometimes like to go to the movies to h...      1   en\n",
      "4     emerges as something rare , an issue movie tha...      1   en\n",
      "...                                                 ...    ...  ...\n",
      "8525  any enjoyment will be hinge from a personal th...      0   en\n",
      "8526  if legendary shlockmeister ed wood had ever ma...      0   en\n",
      "8527  hardly a nuanced portrait of a young woman's b...      0   en\n",
      "8528    interminably bleak , to say nothing of boring .      0   en\n",
      "8529  things really get weird , though not particula...      0   en\n",
      "\n",
      "[8352 rows x 3 columns]\n"
     ]
    }
   ],
   "source": [
    "# Filter out non-English text\n",
    "train_df[\"lang\"] = train_df[\"text\"].apply(detect)\n",
    "train_df = train_df[train_df['lang'] == 'en']\n",
    "print(train_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "c2986708",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                   text  label lang\n",
      "0     lovingly photographed in the manner of a golde...      1   en\n",
      "1                 consistently clever and suspenseful .      1   en\n",
      "2     it's like a \" big chill \" reunion of the baade...      1   en\n",
      "3     the story gives ample opportunity for large-sc...      1   en\n",
      "4                     red dragon \" never cuts corners .      1   en\n",
      "...                                                 ...    ...  ...\n",
      "1061  a terrible movie that some people will neverth...      0   en\n",
      "1062  there are many definitions of 'time waster' bu...      0   en\n",
      "1063  as it stands , crocodile hunter has the hurrie...      0   en\n",
      "1064  the thing looks like a made-for-home-video qui...      0   en\n",
      "1065  enigma is well-made , but it's just too dry an...      0   en\n",
      "\n",
      "[1045 rows x 3 columns]\n"
     ]
    }
   ],
   "source": [
    "test_df[\"lang\"] = test_df[\"text\"].apply(detect)\n",
    "test_df = test_df[test_df['lang'] == 'en']\n",
    "print(test_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "82b5bd66",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                   text  label lang  \\\n",
      "0     the rock is destined to be the 21st century's ...      1   en   \n",
      "1     the gorgeously elaborate continuation of \" the...      1   en   \n",
      "2                        effective but too-tepid biopic      1   en   \n",
      "3     if you sometimes like to go to the movies to h...      1   en   \n",
      "4     emerges as something rare , an issue movie tha...      1   en   \n",
      "...                                                 ...    ...  ...   \n",
      "8525  any enjoyment will be hinge from a personal th...      0   en   \n",
      "8526  if legendary shlockmeister ed wood had ever ma...      0   en   \n",
      "8527  hardly a nuanced portrait of a young woman's b...      0   en   \n",
      "8528    interminably bleak , to say nothing of boring .      0   en   \n",
      "8529  things really get weird , though not particula...      0   en   \n",
      "\n",
      "                                         tokenized_text  \n",
      "0     [the, rock, is, destined, to, be, the, 21st, c...  \n",
      "1     [the, gorgeously, elaborate, continuation, of,...  \n",
      "2                   [effective, but, too-tepid, biopic]  \n",
      "3     [if, you, sometimes, like, to, go, to, the, mo...  \n",
      "4     [emerges, as, something, rare, ,, an, issue, m...  \n",
      "...                                                 ...  \n",
      "8525  [any, enjoyment, will, be, hinge, from, a, per...  \n",
      "8526  [if, legendary, shlockmeister, ed, wood, had, ...  \n",
      "8527  [hardly, a, nuanced, portrait, of, a, young, w...  \n",
      "8528  [interminably, bleak, ,, to, say, nothing, of,...  \n",
      "8529  [things, really, get, weird, ,, though, not, p...  \n",
      "\n",
      "[8352 rows x 4 columns]\n",
      "                                                   text  label lang  \\\n",
      "0     lovingly photographed in the manner of a golde...      1   en   \n",
      "1                 consistently clever and suspenseful .      1   en   \n",
      "2     it's like a \" big chill \" reunion of the baade...      1   en   \n",
      "3     the story gives ample opportunity for large-sc...      1   en   \n",
      "4                     red dragon \" never cuts corners .      1   en   \n",
      "...                                                 ...    ...  ...   \n",
      "1061  a terrible movie that some people will neverth...      0   en   \n",
      "1062  there are many definitions of 'time waster' bu...      0   en   \n",
      "1063  as it stands , crocodile hunter has the hurrie...      0   en   \n",
      "1064  the thing looks like a made-for-home-video qui...      0   en   \n",
      "1065  enigma is well-made , but it's just too dry an...      0   en   \n",
      "\n",
      "                                         tokenized_text  \n",
      "0     [lovingly, photographed, in, the, manner, of, ...  \n",
      "1           [consistently, clever, and, suspenseful, .]  \n",
      "2     [it, 's, like, a, ``, big, chill, ``, reunion,...  \n",
      "3     [the, story, gives, ample, opportunity, for, l...  \n",
      "4            [red, dragon, ``, never, cuts, corners, .]  \n",
      "...                                                 ...  \n",
      "1061  [a, terrible, movie, that, some, people, will,...  \n",
      "1062  [there, are, many, definitions, of, 'time, was...  \n",
      "1063  [as, it, stands, ,, crocodile, hunter, has, th...  \n",
      "1064  [the, thing, looks, like, a, made-for-home-vid...  \n",
      "1065  [enigma, is, well-made, ,, but, it, 's, just, ...  \n",
      "\n",
      "[1045 rows x 4 columns]\n"
     ]
    }
   ],
   "source": [
    "# Split into words\n",
    "train_df[\"tokenized_text\"] = train_df[\"text\"].apply(word_tokenize)\n",
    "print(train_df)\n",
    "test_df[\"tokenized_text\"] = test_df[\"text\"].apply(word_tokenize)\n",
    "print(test_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "c7221d26",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                   text  label lang  \\\n",
      "0     the rock is destined to be the 21st century's ...      1   en   \n",
      "1     the gorgeously elaborate continuation of \" the...      1   en   \n",
      "2                        effective but too-tepid biopic      1   en   \n",
      "3     if you sometimes like to go to the movies to h...      1   en   \n",
      "4     emerges as something rare , an issue movie tha...      1   en   \n",
      "...                                                 ...    ...  ...   \n",
      "8525  any enjoyment will be hinge from a personal th...      0   en   \n",
      "8526  if legendary shlockmeister ed wood had ever ma...      0   en   \n",
      "8527  hardly a nuanced portrait of a young woman's b...      0   en   \n",
      "8528    interminably bleak , to say nothing of boring .      0   en   \n",
      "8529  things really get weird , though not particula...      0   en   \n",
      "\n",
      "                                         tokenized_text  \n",
      "0     [rock, destined, 21st, century, new, conan, go...  \n",
      "1     [gorgeously, elaborate, continuation, lord, ri...  \n",
      "2                        [effective, too-tepid, biopic]  \n",
      "3     [sometimes, like, go, movies, fun, wasabi, goo...  \n",
      "4     [emerges, something, rare, issue, movie, hones...  \n",
      "...                                                 ...  \n",
      "8525  [enjoyment, hinge, personal, threshold, watchi...  \n",
      "8526  [legendary, shlockmeister, ed, wood, ever, mad...  \n",
      "8527  [hardly, nuanced, portrait, young, woman, brea...  \n",
      "8528        [interminably, bleak, say, nothing, boring]  \n",
      "8529  [things, really, get, weird, though, particula...  \n",
      "\n",
      "[8352 rows x 4 columns]\n",
      "                                                   text  label lang  \\\n",
      "0     lovingly photographed in the manner of a golde...      1   en   \n",
      "1                 consistently clever and suspenseful .      1   en   \n",
      "2     it's like a \" big chill \" reunion of the baade...      1   en   \n",
      "3     the story gives ample opportunity for large-sc...      1   en   \n",
      "4                     red dragon \" never cuts corners .      1   en   \n",
      "...                                                 ...    ...  ...   \n",
      "1061  a terrible movie that some people will neverth...      0   en   \n",
      "1062  there are many definitions of 'time waster' bu...      0   en   \n",
      "1063  as it stands , crocodile hunter has the hurrie...      0   en   \n",
      "1064  the thing looks like a made-for-home-video qui...      0   en   \n",
      "1065  enigma is well-made , but it's just too dry an...      0   en   \n",
      "\n",
      "                                         tokenized_text  \n",
      "0     [lovingly, photographed, manner, golden, book,...  \n",
      "1                   [consistently, clever, suspenseful]  \n",
      "2     [like, big, chill, reunion, baader-meinhof, ga...  \n",
      "3     [story, gives, ample, opportunity, large-scale...  \n",
      "4                   [red, dragon, never, cuts, corners]  \n",
      "...                                                 ...  \n",
      "1061  [terrible, movie, people, nevertheless, find, ...  \n",
      "1062  [many, definitions, 'time, waster, movie, must...  \n",
      "1063  [stands, crocodile, hunter, hurried, badly, co...  \n",
      "1064  [thing, looks, like, made-for-home-video, quic...  \n",
      "1065                   [enigma, well-made, dry, placid]  \n",
      "\n",
      "[1045 rows x 4 columns]\n"
     ]
    }
   ],
   "source": [
    "# Remove stopwords and punctuation\n",
    "stop_words = list(stopwords.words('english'))\n",
    "stop_words.append(\"``\")\n",
    "stop_words.append(\"'s\")\n",
    "def remove_stopwords_and_punct(x):\n",
    "    new_list = [w for w in x if w not in stop_words and w not in punctuation]\n",
    "    return new_list\n",
    "train_df[\"tokenized_text\"] = train_df[\"tokenized_text\"].apply(remove_stopwords_and_punct)\n",
    "print(train_df)\n",
    "test_df[\"tokenized_text\"] = test_df[\"tokenized_text\"].apply(remove_stopwords_and_punct)\n",
    "print(test_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "78fb58b0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "       text  lang  tokenized_text\n",
      "label                            \n",
      "0      4185  4185            4185\n",
      "1      4167  4167            4167\n",
      "       text  lang  tokenized_text\n",
      "label                            \n",
      "0       523   523             523\n",
      "1       522   522             522\n"
     ]
    }
   ],
   "source": [
    "# Count number of items per class\n",
    "print(train_df.groupby('label').count())\n",
    "print(test_df.groupby('label').count())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "1d768a02",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save to disk\n",
    "train_df.to_json(\"../data/rotten_tomatoes_train.json\")\n",
    "test_df.to_json(\"../data/rotten_tomatoes_test.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "3b91d6eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_stats(word_list, num_words=200):\n",
    "    freq_dist = FreqDist(word_list)\n",
    "    print(freq_dist.most_common(num_words))\n",
    "    return freq_dist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "42d5916d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('film', 685), ('movie', 429), (\"n't\", 286), ('one', 280), ('--', 271), ('like', 208), ('story', 194), ('comedy', 159), ('good', 150), ('even', 144), ('funny', 138), ('way', 135), ('time', 127), ('best', 126), ('characters', 125), ('make', 124), ('life', 124), ('us', 123), ('much', 122), ('love', 118), ('makes', 117), ('performances', 117), ('may', 113), ('work', 111), ('director', 110), ('enough', 105), ('look', 103), ('still', 96), ('little', 94), ('well', 93), ('new', 92), ('films', 92), ('movies', 89), ('fun', 89), ('great', 88), ('drama', 86), ('two', 85), ('performance', 82), ('never', 81), ('could', 80), ('see', 77), ('world', 77), ('people', 76), ('cast', 75), ('many', 74), ('also', 73), ('though', 73), ('tale', 71), ('first', 70), ('documentary', 69), ('without', 69), ('entertaining', 68), ('big', 68), ('made', 67), ('heart', 66), ('ever', 65), ('family', 65), ('often', 64), ('would', 64), ('humor', 64), (\"'re\", 63), ('sense', 63), ('romantic', 61), ('human', 61), ('audience', 60), ('something', 59), ('real', 59), ('american', 59), ('picture', 57), ('cinema', 57), ('yet', 57), ('get', 57), ('take', 56), ('character', 56), ('really', 55), ('compelling', 55), ('thriller', 55), ('far', 53), ('works', 53), ('year', 53), ('kind', 53), ('man', 53), (\"'ll\", 53), ('worth', 52), ('might', 52), ('better', 51), ('moving', 51), ('every', 51), ('music', 50), ('gives', 50), ('part', 50), ('full', 50), ('plot', 50), ('fascinating', 49), ('seen', 49), ('long', 48), ('right', 48), ('moments', 48), ('hollywood', 48), ('always', 47), ('screen', 47), ('young', 47), ('takes', 47), ('feel', 46), ('interesting', 46), ('history', 45), ('times', 44), ('piece', 44), ('experience', 43), ('quite', 43), ('portrait', 43), ('watching', 42), ('manages', 42), ('another', 42), ('want', 41), ('lot', 41), ('bit', 41), ('emotional', 41), ('years', 40), ('sweet', 40), ('go', 39), ('comes', 39), ('give', 39), ('smart', 39), ('almost', 39), ('less', 39), ('original', 39), ('enjoyable', 39), ('solid', 39), ('despite', 39), ('ultimately', 38), ('kids', 38), ('ca', 38), ('beautiful', 38), ('action', 38), ('nothing', 38), ('cinematic', 38), ('keep', 38), ('seems', 38), (\"'ve\", 38), ('offers', 37), ('culture', 37), ('old', 37), ('together', 37), ('back', 37), ('scenes', 37), ('touching', 37), ('women', 37), ('mr', 36), ('intelligent', 36), ('script', 36), ('come', 36), ('rare', 35), ('charming', 35), ('charm', 35), ('rather', 35), ('acting', 35), ('power', 35), ('find', 35), ('end', 35), ('delivers', 35), ('gets', 34), ('powerful', 34), ('hard', 34), ('fans', 34), ('sometimes', 33), ('entertainment', 33), ('true', 33), ('actors', 33), ('engaging', 33), ('art', 33), ('whose', 33), ('study', 33), ('away', 32), ('know', 32), ('beautifully', 32), ('perfect', 32), ('face', 32), ('subject', 32), ('fine', 32), ('thing', 32), ('surprisingly', 32), ('going', 31), ('lives', 31), ('think', 31), ('since', 31), ('journey', 31), ('watch', 31), ('style', 31), ('genre', 31), ('terrific', 31), ('least', 31), ('minutes', 30), ('say', 30), ('hilarious', 30), ('french', 30), ('dark', 30), ('making', 29), ('deeply', 29), ('visual', 29)]\n",
      "[('movie', 642), ('film', 557), (\"n't\", 448), ('like', 354), ('one', 293), ('--', 263), ('story', 189), ('much', 176), ('bad', 173), ('even', 159), ('time', 145), ('good', 143), ('characters', 138), ('little', 137), ('would', 130), ('never', 122), ('comedy', 121), ('enough', 107), ('really', 104), ('nothing', 103), ('make', 102), ('way', 102), ('plot', 99), ('could', 96), ('director', 96), ('makes', 93), ('made', 92), ('something', 90), ('script', 87), ('might', 87), ('every', 87), ('funny', 86), ('may', 85), ('thing', 83), ('get', 83), ('long', 82), (\"'re\", 82), ('action', 81), ('better', 79), ('many', 79), ('feels', 78), ('another', 77), ('movies', 76), ('new', 76), ('well', 75), ('work', 74), ('minutes', 73), ('seems', 71), ('us', 71), ('people', 69), ('ca', 69), ('less', 68), ('see', 68), ('without', 65), ('two', 64), ('audience', 63), ('films', 62), ('character', 62), ('life', 62), ('best', 61), ('big', 61), ('kind', 60), ('hard', 60), ('love', 60), ('rather', 59), ('almost', 58), ('far', 58), ('interesting', 57), ('sense', 57), ('things', 56), ('dull', 56), ('watching', 55), (\"'ve\", 55), ('despite', 54), ('feel', 54), ('picture', 54), ('though', 53), ('ultimately', 53), ('comes', 52), ('end', 51), ('humor', 51), ('material', 50), ('drama', 50), ('quite', 50), ('video', 49), ('lot', 49), ('screen', 49), ('real', 49), ('dialogue', 49), ('seem', 48), ('acting', 48), ('also', 47), ('go', 47), ('hollywood', 47), ('cast', 47), ('full', 47), ('look', 47), ('old', 46), ('silly', 45), ('first', 45), ('point', 45), ('whole', 45), ('boring', 45), (\"'ll\", 45), ('anyone', 45), ('come', 44), ('actors', 44), ('often', 44), ('ever', 44), ('going', 44), ('fun', 44), ('thriller', 44), ('original', 43), ('anything', 43), ('gets', 43), ('still', 43), ('find', 42), ('lack', 42), ('instead', 41), ('back', 41), ('watch', 41), ('title', 41), ('series', 41), ('great', 41), ('predictable', 40), ('worst', 40), ('tale', 40), ('tv', 39), ('last', 39), ('takes', 39), ('yet', 39), ('care', 38), ('performance', 38), ('think', 38), ('scenes', 38), ('plays', 38), ('man', 38), ('say', 38), ('take', 37), ('special', 37), ('moments', 37), ('flick', 37), ('times', 36), (\"'d\", 36), ('mess', 36), ('year', 36), ('seen', 36), ('problem', 36), ('idea', 36), ('laughs', 36), ('least', 36), ('fails', 36), ('since', 36), ('performances', 35), ('everything', 35), ('next', 35), ('trying', 35), ('world', 35), ('subject', 34), ('documentary', 34), ('pretty', 34), ('premise', 34), ('know', 34), ('star', 34), ('right', 34), ('simply', 34), ('probably', 33), ('john', 33), ('tries', 33), ('fact', 33), ('nearly', 33), ('neither', 33), ('high', 33), ('jokes', 33), ('american', 32), ('theater', 32), ('young', 32), ('reason', 32), ('else', 32), ('short', 32), ('away', 32), ('need', 32), ('goes', 31), ('making', 31), ('show', 31), ('cinematic', 31), ('exercise', 30), ('turns', 30), ('set', 30), ('flat', 30), ('interest', 30), ('left', 30), ('likely', 30), ('half', 30), ('direction', 30), ('sometimes', 30), ('actually', 29), ('sort', 29), ('narrative', 29), ('pretentious', 29)]\n"
     ]
    }
   ],
   "source": [
    "# Show most common words\n",
    "positive_train_words = train_df[train_df[\"label\"] == 1].tokenized_text.sum()\n",
    "negative_train_words = train_df[train_df[\"label\"] == 0].tokenized_text.sum()\n",
    "positive_fd = get_stats(positive_train_words)\n",
    "negative_fd = get_stats(negative_train_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c427a393",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
